Tesla Model 3 Discussion Forum Analysis

# plotting and pipes
library(tidyverse)
library(stringr)
library(tidyr)

# text mining library
library(tm)
library(tidytext)
library(wordcloud)
library(reshape2)
library(textstem)
library(ggraph)
library(igraph)
library(widyr)
library(spacyr)
library(SnowballC)
library(topicmodels)
library(quanteda)
library(seededlda)

# date/time library
library(lubridate)

Data Import, Wrangling, and Cleaning

# Read in the tesla forum data
# Read in the Tesla forum data
df <- read.csv("tesla_forums.csv")

# Fix variable types, drop the small number of NA rows, and remove the
# many duplicates created by the scraping method.
df <- df %>%
  mutate(
    Time  = as_datetime(Time),
    User  = as.factor(User),
    Topic = as.factor(Topic)
  ) %>%
  drop_na() %>%
  distinct()

# Drop the first topic (rows 1-24): it is the "how to use the forums"
# thread and doesn't aid in analysis.
df <- df[-(1:24), ]

# Add an incrementing doc_id (first column) and a character-count column.
df <- df %>%
  mutate(
    doc_id   = paste0("doc", row_number()),
    text_len = str_count(Discussion)
  ) %>%
  select(doc_id, everything())

Examine the Structure

# Inspect column types and a preview of each variable
str(df)
## 'data.frame':    54311 obs. of  6 variables:
##  $ doc_id    : chr  "doc1" "doc2" "doc3" "doc4" ...
##  $ Topic     : Factor w/ 3676 levels "'Come to Me' won't highlight",..: 2394 1625 1625 1625 1625 3210 3210 3210 3210 3210 ...
##  $ Discussion: chr  "I have 2 questions...\n1. I reserved my Model 3 on 3/31 but BEFORE the unveil. Will this be this be taken into "| __truncated__ "Hello\nFor past couple of months, Iam noticing that my volume in the car needs to be way high for me to have a "| __truncated__ "Yes I have noticed this as well. Exactly as you described." "Agreed. It seemed to coincide with iOS 14, but that could just be what I associated it with." ...
##  $ User      : Factor w/ 6564 levels "_BOBO_","_Bullwinkle_",..: 6336 1680 78 3656 78 203 3672 2358 633 2576 ...
##  $ Time      : POSIXct, format: "2016-06-14 22:35:52" "2020-12-15 01:34:43" ...
##  $ text_len  : int  354 259 58 92 72 454 112 140 86 728 ...
# Summary statistics for the text, timestamp and length columns
df %>%
  select(Discussion, Time, text_len) %>%
  summary()
##   Discussion             Time                        text_len     
##  Length:54311       Min.   :2015-12-10 19:16:17   Min.   :   1.0  
##  Class :character   1st Qu.:2019-10-04 08:41:09   1st Qu.:  86.0  
##  Mode  :character   Median :2020-04-03 16:40:26   Median : 179.0  
##                     Mean   :2020-01-26 01:41:25   Mean   : 278.4  
##                     3rd Qu.:2020-07-29 17:58:40   3rd Qu.: 348.0  
##                     Max.   :2020-12-15 02:19:29   Max.   :7944.0

Observations * Discussions: There are a total of 54,311 discussion threads in this dataset after removing duplicates. This is essentially like a comment on a Facebook post. A Topic (not shown) is posted, and Discussions happen on those topics. * Time: Dates range from 2015-12-10 to 2020-12-15. The Median, Mean and 3rd Quartile are all in 2020, telling us that most of the dates in this set are in 2020. * Text_Len: The minimum length of text is 1 character and the maximum is 7,944 characters, with a median of 179.0.

EDA

EDA, or Exploratory Data Analysis, is performed to better understand the characteristics, extents, and shape of our data.

# Work on a copy so the raw data frame `df` stays available for later
# lookups (e.g. retrieving full documents by doc_id after cleaning).
df_select <- df

Topic & User Information

The way that this data is stored is that for each discussion row, the topic title is repeated. Therefore we need to summarize the rows and aggregate them into counts for each unique topic. This way we can also see how many discussions each topic received.

# Aggregate the per-reply rows into one count per unique topic,
# most-discussed first.  .groups = "drop" returns an ungrouped result.
df_topics <- df_select %>%
  group_by(Topic) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count))
head(df_topics)
# Histogram of replies-per-topic; log10 y-axis because the
# distribution is heavily right-skewed.
df_topics %>%
  ggplot(aes(count)) + 
  geom_histogram(fill="lightgray", color="gray", bins=30) +
  theme_minimal() +
  scale_y_log10() +
  labs(x = "Number of Discussions per Topic",
       y = "Count (Log10 Scale)",
      title = "Distributions of Discussions per Topic",
      subtitle = "Number of replies per unique thread"
      ) +
  theme(plot.title = element_text(face = "bold"))

Regarding the number of Discussions per Topic, the distribution is heavily right skewed: roughly 500-1,000 topics have 0-25 discussions each. Beyond 25 or so (x-axis), there are just a few topics with more than 25 replies per topic. There are two topics above 75, as noted in the table above.

# Post count per user, most active first.  .groups = "drop" returns an
# ungrouped result (the previous "keep" left a grouped tibble behind).
df_users <- df_select %>% 
  group_by(User) %>%
  summarise(count = n(), .groups = "drop") %>% 
  arrange(desc(count))
head(df_users, n=10)

The forums are quite active by various users. 8 users have over 1,000 posts in this dataset.

# Histogram of posts-per-user on a log10 count axis.
# (Fixed label typos: "Number Posts" -> "Number of Posts",
# "entried" -> "entries".)
df_users %>%
  ggplot(aes(count)) + 
  geom_histogram(fill="lightgray", color="gray", bins=30) +
  theme_minimal() +
  scale_y_log10() +
  labs(x = "Number of Posts",
       y = "Count (Log10 Scale)",
      title = "Distributions of Active Users",
      subtitle = "Number of unique entries per user name"
      ) +
  theme(plot.title = element_text(face = "bold"))

A large number of users (1,500+) have a very small number of posts. A small number of users are extremely active on the forums, having > 500 posts.

# Headline counts: unique topics, total replies, unique users
sprintf("There are %s unique topics", nrow(df_topics))
## [1] "There are 3676 unique topics"
sprintf("There are %s total discussion threads (replies)", nrow(df_select))
## [1] "There are 54311 total discussion threads (replies)"
sprintf("There are %s total unique users", nrow(df_users))
## [1] "There are 6548 total unique users"

Text Length

# Five-number summary of post length in characters
summary(df_select$text_len)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    86.0   179.0   278.4   348.0  7944.0

Discussion lengths for the dataset range from 1 character to 7,944, with a median of 179 and a mean of 278.

# Distribution of reply lengths (characters); log10 count axis
# because short posts dominate.
ggplot(df_select, aes(text_len)) +
  geom_histogram(fill = "lightgray", color = "gray", bins = 30) +
  scale_y_log10() +
  theme_minimal() +
  labs(
    x = "Text Length",
    y = "Count (Log10 Scale)",
    title = "Distributions of Text Length",
    subtitle = "Per character counts of the replies to topics"
  ) +
  theme(plot.title = element_text(face = "bold"))

Text length for posts is right skewed as well with most posts being shorter in length. But there is a much more spread distribution towards the right tail.

# Weekly post volume: bucket timestamps to the start of their week,
# then count rows per bucket.  .groups = "drop" returns an ungrouped
# data frame for plotting.
plot_df_time <- df %>%
  mutate(date = floor_date(Time, "week")) %>%
  group_by(date) %>%
  summarize(count = n(), .groups = "drop")


ggplot(plot_df_time, aes(date, count)) +
  geom_line(show.legend = FALSE) +
  labs(x = NULL, y = "") +
  theme_minimal()

When viewing the time frequency of the posts, the data does go back to 2016, but activity jumps at the start of 2020. Due to the way these were scraped from the forums — starting with the newest posts and working backwards — it is expected that more of the data falls in the most recent year. There is a dip in posts around mid-2020; this is most likely a scraping error rather than a lack of activity on the forum. For the sake of this analysis, which is focused mostly on text, it’s not critical to understand.

Outlier Analysis

# Show one example of an extremely long post (> 5000 characters).
head(select(filter(df_select, text_len > 5000), Discussion), n = 1)

Note: Since this is discussion forum text, outliers are simply long posts as demonstrated above. They will remain in the dataset since longer text often contains valuable information.

Clean and Prepare the Text for Analysis

Clean The Text

# Normalize encoding (strip non-ASCII), then remove markup noise.
df_select$Discussion <- iconv(df_select$Discussion, "latin1", "ASCII", sub = "")
# Replace newlines with a SPACE: replacing with "" glued adjacent words
# together (e.g. "Hello\nFor" became "hellofor" downstream).
df_select$Discussion <- str_replace_all(df_select$Discussion, "\\n", " ")
df_select$Discussion <- str_replace_all(df_select$Discussion, "@", "")
# Drop URLs entirely.
df_select$Discussion <- gsub("http[[:alnum:][:punct:]]*", "", df_select$Discussion)

Remove Whitespace, Punctuation, Stopwords and Lemmatize

Standard text cleaning to normalize and finally lemmatize the words to their root lemma, or base word. In this operation, we will not remove numbers, since we are focused on the Model 3, which contains a number in its name, later used for n-Gram creation.

# Standard normalization pipeline: punctuation, whitespace, case,
# stopwords, then lemmatization to base forms.  (Changed `=` to the
# conventional `<-` assignment operator.)
df_select$Discussion <- removePunctuation(df_select$Discussion)
df_select$Discussion <- stripWhitespace(df_select$Discussion)
df_select$Discussion <- tolower(df_select$Discussion)
df_select$Discussion <- removeWords(df_select$Discussion, c(stopwords('english')))
df_select$Discussion <- lemmatize_strings(df_select$Discussion)
head(df_select$Discussion)
## [1] "2 questions1 reserve model 3 331 unveil take consideration2 im drummer need know kit mainly kick drum go fit car seat fold etc continuous glass look like trunk open go pretty limitedthankswendy"                                                                    
## [2] "hellofor past couple month iam notice volume car need way high decent conversation iphone 11 bluetooth music stream good phone call sound like whisper volume anybody else notice"                                                                                    
## [3] "yes notice good exactly describe"                                                                                                                                                                                                                                     
## [4] "agree seem coincide ios 14 just associate"                                                                                                                                                                                                                            
## [5] "good android look like common denominator tesla"                                                                                                                                                                                                                      
## [6] "drive home today tire pressure signal come panel check psi indicator tire low part panel tire read 41 psi one 41s yellow color different other get home check tire indicate manually 41 just like panel indicate tire malfunction indicate something aware thank alan"

Unnest_Tokens()

Create a new column with each word on it’s own row.

# Tokenize: one row per word, in a new `word` column.
tidy_df <- unnest_tokens(df_select, word, Discussion)

Validate the New number of Rows

Dramatically larger now that each word from text is in it’s own row.

# Row count after tokenization — one row per word now
nrow(tidy_df)
## [1] 1454047

Sentiment Analysis

Bing Sentiment Lexicon

Using the Bing Lexicon from Bing Liu and collaborators, adds the column “Sentiment” and mark each word as positive or negative.

https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

# Tag tokens as positive/negative via the Bing lexicon; inner_join
# drops words that are not in the lexicon.
bing_df <- tidy_df %>%
  inner_join(get_sentiments("bing"), by = "word")
# Overall counts of positive vs negative tokens
bing_df %>%
  group_by(sentiment) %>%
  summarise(count = n(), .groups = "drop")

AFINN scoring Lexicon

AFINN from Finn Årup Nielsen, adds the value column, with a numeric representation of how positive, or negative the word is. The AFINN lexicon measures sentiment with a numeric score between -5 and 5

http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html

# Score tokens with the AFINN lexicon (integer values -5..5).
afinn_df <- tidy_df %>%
  inner_join(get_sentiments("afinn"), by = "word")

head(afinn_df)
# Distribution of AFINN scores.  (Removed scale_colour_grey():
# no colour aesthetic is mapped, so it had no effect.)
afinn_df %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 10, show.legend = FALSE, fill="lightgray", color="darkgray") +
  scale_x_continuous(breaks = c(-5, -3, -1, 1, 3, 5)) +
  theme_minimal()

For the dataset overall, there is a slight left-skew showing there is a greater concentration of words with positive values. There are very few in the high and low values (-4,-5, +5).

Note: 0 is not a valid value in this scoring system, therefore the bin is empty

NRC Sentiment Lexicon

NRC from Saif Mohammad and Peter Turney. The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions as well as positive and negative sentiment.

One thing to note, single words can have multiple emotions

# Keep only tokens present in the NRC lexicon; a single word may match
# several emotion categories, which duplicates its rows.
nrc_df <- inner_join(tidy_df, get_sentiments("nrc"), by = "word")

Total counts for all 8 emotions and 2 sentiments.

# Token counts per NRC category (8 emotions + 2 sentiments),
# largest first.
nrc_df %>%
  group_by(sentiment) %>%
  summarise(total = n(), .groups = "drop") %>%
  arrange(desc(total))

Inspect Top Words

Using various methods, inspect which words are most frequently used, the proportions of negative and positive words, and the trend over time.

Top Word Counts (BING)

# Top 15 positive and top 15 negative words by frequency (Bing).
# slice_max() replaces the superseded top_n(); ties are kept either way.
bing_df %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  theme_minimal() +
  labs(x = "Contribution to sentiment",
       y = NULL)

Focusing on the Negative words, the top occurrence is issue and second is problem. Given the nature of this forum, talking about a product, these are very practical words to be at the top of the list: people reporting or discussing issues and problems with their cars. Bug, Noise, Break, and Damage all feel like perfect matches as well. Numb is an interesting occurrence and worth looking into a little more.

Overall Top Words (BING)

# Top 30 words overall, coloured by sentiment.  slice_max() replaces
# the superseded top_n(); the stray ungroup() was a no-op (count()
# returns ungrouped data) and is removed.
bing_df %>%
  count(word, sentiment, sort = TRUE) %>%
  slice_max(n, n = 30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = TRUE) +
  theme_minimal() +
  labs(x = "Contribution to sentiment", y = NULL)

Sentiment over Time (AFINN)

# Mean AFINN score per calendar month.
plot_df2 <- afinn_df %>%
  mutate(mon = floor_date(Time, "month")) %>%
  group_by(mon) %>%
  summarize(value = mean(value), .groups = "drop")

# Flag each month as net-positive or net-negative for the fill colour.
plot_df2$color <- ifelse(plot_df2$value < 0, "negative","positive")

ggplot(plot_df2, aes(mon, value, fill = color)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Mean AFINN Sentiment Score") +
  theme_minimal()

Most Positive Messages

# Documents with the highest total AFINN score.
afinn_df %>%
  group_by(doc_id) %>%
  summarize(total_value = sum(value), word_count = n(), .groups = "drop") %>%
  arrange(desc(total_value)) %>%
  head()
# Read the full original text of the top three documents
# (%in% replaces the chained | comparisons).
df %>%
  filter(doc_id %in% c("doc51919", "doc44774", "doc20019")) %>%
  select(Discussion)

Most Negative Messages

# Documents with the lowest (most negative) total AFINN score.
afinn_df %>%
  group_by(doc_id) %>%
  summarize(total_value = sum(value), word_count = n(), .groups = "drop") %>%
  arrange(total_value) %>%
  head()
# Read the full original text of the bottom three documents
# (%in% replaces the chained | comparisons).
df %>%
  filter(doc_id %in% c("doc390", "doc47992", "doc29552")) %>%
  select(Discussion)

Word Cloud

Word cloud of the top 200 words grouped by sentiment, positive or negative.

# Comparison cloud: top 200 words, split into positive vs negative
# columns via acast before plotting.
comparison.cloud(
  acast(count(bing_df, word, sentiment, sort = TRUE),
        word ~ sentiment, value.var = "n", fill = 0),
  colors = c("gray", "black"), max.words = 200
)

n-Grams

Bi-Grams

# Tokenize into bigrams, count each distinct pair, split the pair into
# word1/word2, and drop rows with NA (documents too short for a bigram).
bigrams <- df_select %>%
  unnest_tokens(bigram, Discussion, token = "ngrams", n = 2)

bigram_counts <- bigrams %>%
  count(bigram, sort = TRUE) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  drop_na()
bigram_counts

# Keep only frequent pairs and build a directed word graph.
bigram_graph <- bigram_counts %>%
  filter(n > 275) %>%
  graph_from_data_frame()

# Fixed seed so the graph layout is reproducible.
set.seed(2017)
edge_arrow <- grid::arrow(type = "open", length = unit(.05, "inches"))

# Draw the bigram network: arrows point from word1 to word2.
ggraph(bigram_graph, layout = "nicely") +
  geom_edge_link(arrow = edge_arrow, end_cap = circle(.02, 'inches')) +
  geom_node_point(color = "gray", size = 2) +
  geom_node_text(aes(label = name), vjust = -1, hjust = 1) +
  theme_minimal()

Trigrams

# Tokenize into trigrams.  (Renamed the token column from the
# copy-pasted "bigram" to "trigram" — all uses are within this chunk.)
trigrams <- df_select %>%
  unnest_tokens(trigram, Discussion, token = "ngrams", n = 3)
trigram_counts <- trigrams %>%
  count(trigram, sort = TRUE) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ")

# Drop NA rows (documents too short to form a trigram).
trigram_counts <- drop_na(trigram_counts)
trigram_counts
# Keep only frequent triples and build a directed word graph.
trigram_graph <- trigram_counts %>%
  filter(n > 50) %>%
  graph_from_data_frame()

# Fixed seed so the graph layout is reproducible.
set.seed(2017)
edge_arrow <- grid::arrow(type = "open", length = unit(.05, "inches"))

# Draw the trigram network.
ggraph(trigram_graph, layout = "nicely") +
  geom_edge_link(arrow = edge_arrow, end_cap = circle(.02, 'inches')) +
  geom_node_point(color = "gray", size = 2) +
  geom_node_text(aes(label = name), vjust = 1.5, hjust = -.25) +
  theme_minimal()

Topic Modeling of Topics

Preprocessing

corpus <- Corpus(VectorSource(df_topics$Topic))

Before cleaning

# Peek at the raw topic titles before any cleaning
inspect(corpus[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] My Model 3 experience and Future Tesla Outlook                             
## [2] Model 3 Battery Degradation 6%, 14 months, 37,000 miles                    
## [3] “Tesla Model 3 Rear Bumper Falling Off Issue More Widespread Than Expected”
## [4] Model S vs. Model 3.                                                       
## [5] Bjorn Nyland Degradation 6% first year, 37,000 miles

Remove Words that Aren’t Helpful for Topic Modeling

Custom list of words generated while performing the topic modeling. These words appeared frequently but do not add much context for topic identification, or appear so frequently (such as “model” and “tesla”) that they show up in every topic.

# Normalize the topic titles.  Plain R functions are wrapped in
# content_transformer() so tm_map preserves the corpus structure
# (passing tolower bare is a documented tm pitfall).
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)  # remove punctuation
corpus <- tm_map(corpus, stripWhitespace)    # remove white space
corpus <- tm_map(corpus, removeWords, c(stopwords('english')))
corpus <- tm_map(corpus, content_transformer(lemmatize_strings)) # lemmatization
# Strip smart quotes/apostrophes that survive removePunctuation
# (one character class replaces the three separate gsub passes).
corpus <- tm_map(corpus, content_transformer(function(x) gsub("[“”’]", " ", x)))

# Drop very frequent words that carry no topical signal here.
corpus <- tm_map(corpus, removeWords, c("get", "much", "can", 
                                        "will", "say", "car", "may", "use",
                                        "just", "one", "good", "like", "think",
                                        "model", "tesla", "anyone", "god", 
                                        "2020", "2021"))
# Same titles after cleaning — stopwords, custom words and punctuation gone
inspect(corpus[1:6])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 6
## 
## [1]  3 experience future  outlook                   
## [2]  3 battery degradation 6 14 month 37000 mile    
## [3]     3 rear bumper fall issue widespread expect  
## [4]  s vs  3                                        
## [5] bjorn nyland degradation 6 first year 37000 mile
## [6] floor mat

Document Term Matrix Creation

# Document-term matrix: rows = topic titles, columns = terms,
# cells = term frequency
dtm <- DocumentTermMatrix(corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 3676, terms: 3290)>>
## Non-/sparse entries: 13769/12080271
## Sparsity           : 100%
## Maximal term length: 32
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   app battery charge drive issue new range tire update work
##   1173   0       0      0     1     0   0     0    0      0    0
##   1518   0       0      0     0     0   0     0    0      0    0
##   1629   0       0      0     0     0   0     0    0      0    1
##   1708   0       0      0     0     0   0     0    0      0    0
##   1785   0       0      0     0     0   0     0    0      0    0
##   2533   0       0      0     0     0   0     0    0      0    0
##   3012   0       0      0     0     0   0     0    0      0    0
##   3290   1       0      0     0     0   0     0    0      0    0
##   3321   0       0      0     0     0   0     0    0      0    1
##   3348   0       0      0     0     0   0     0    0      0    0

Remove Sparse Terms

# Drop terms absent from more than 99.5% of documents
# (changed `=` to the conventional `<-` assignment).
dtm <- removeSparseTerms(dtm, .995)
inspect(dtm)
## <<DocumentTermMatrix (documents: 3676, terms: 135)>>
## Non-/sparse entries: 5161/491099
## Sparsity           : 99%
## Maximal term length: 12
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   app battery charge drive issue new range tire update work
##   1365   1       0      0     0     0   0     0    0      0    0
##   1639   0       0      0     0     1   0     0    0      0    0
##   2386   0       0      0     0     0   0     0    0      0    1
##   2830   0       0      0     0     0   0     0    0      0    0
##   2866   0       0      0     0     0   0     0    0      0    0
##   2998   0       0      0     0     0   0     0    0      0    0
##   3042   0       0      0     0     0   1     0    0      0    0
##   3051   1       0      1     0     0   0     0    0      0    0
##   3327   0       0      2     0     0   0     0    0      0    0
##   724    0       0      0     0     0   0     0    0      0    0
# Remove documents left empty by sparse-term removal (LDA cannot
# handle all-zero rows).
sel_idx <- rowSums(as.matrix(dtm)) > 0
dtm <- dtm[sel_idx, ]
dim(dtm)
## [1] 2780  135

LDA

Choosing the Best k-Value

# Weight the DTM by TF-IDF, then compute the pairwise euclidean
# distance matrix between documents (input for k-means; the old
# comment mislabelled this as "normalization").
mat <- as.matrix(weightTfIdf(dtm))

scaled_data <- dist(mat, method = "euclidean")

# Elbow method: total within-cluster sum of squares for k = 1..10.
# vapply with seq_len replaces sapply/1:k.max for type safety.
k.max <- 10
data <- scaled_data
wss <- vapply(seq_len(k.max),
              function(k) {
                kmeans(data, k, nstart = 50, iter.max = 10)$tot.withinss
              },
              numeric(1))

plot(
  seq_len(k.max),
  wss,
  type = "b",
  pch = 19,
  frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)

# Fit a 4-topic LDA; fixed seed for reproducibility
lda <- LDA(dtm, k = 4, control = list(seed = 1234))
lda
## A LDA_VEM topic model with 4 topics.

Per-Topic-Per-Word Probabilities (Beta)

# Extract the per-topic-per-word probabilities (beta matrix) as a
# tidy data frame: one row per (topic, term) pair
topics <- tidy(lda, matrix = "beta")

Top-Level Topics

# Top 7 terms per topic by beta.  slice_max() replaces the superseded
# top_n(); ties are kept either way.
top_terms <- topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 7) %>%
  ungroup() %>%
  arrange(topic, desc(beta))


# One facet per topic; reorder_within keeps each panel's terms
# sorted independently.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free", ncol=2) +
    theme_minimal(base_size = 28) + 
    scale_y_reordered()

Topic Models with Quanteda

my_corpus <- corpus(df_select$Discussion)  # build a new corpus from the texts
# NOTE(review): dfm(x, remove_punct =, remove =) is the pre-v3 quanteda
# API; in quanteda >= 3 these options belong in tokens() — confirm the
# installed version still accepts this form.
quant_dfm <- dfm(my_corpus, 
                remove_punct = TRUE, 
                remove_numbers = TRUE, 
                remove = stopwords("english"))
# Keep terms occurring at least 4 times overall but in at most
# 10 documents
quant_dfm <- dfm_trim(quant_dfm, min_termfreq = 4, max_docfreq = 10)
quant_dfm
## Document-feature matrix of: 54,311 documents, 5,534 features (100.0% sparse).
##        features
## docs    unveil whisper coincide 125k itits hubbys watthours amortize greenville
##   text1      1       0        0    0     0      0         0        0          0
##   text2      0       1        0    0     0      0         0        0          0
##   text3      0       0        0    0     0      0         0        0          0
##   text4      0       0        1    0     0      0         0        0          0
##   text5      0       0        0    0     0      0         0        0          0
##   text6      0       0        0    0     0      0         0        0          0
##        features
## docs    nefarious
##   text1         0
##   text2         0
##   text3         0
##   text4         0
##   text5         0
##   text6         0
## [ reached max_ndoc ... 54,305 more documents, reached max_nfeat ... 5,524 more features ]
# Fixed seed for a reproducible model fit
set.seed(100)
# Fit a 20-topic structural topic model only if stm is installed
# (require() returns FALSE instead of erroring when the package
# is missing, which is why it is used here instead of library())
if (require(stm)) {
    my_lda_fit20 <- stm(quant_dfm, K = 20, verbose = FALSE)
    plot(my_lda_fit20)    
}
## Loading required package: stm
## stm v1.3.6 successfully loaded. See ?stm for help. 
##  Papers, resources, and other materials at structuraltopicmodel.com
## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## text3, text5, text6, text7, text8, text9, text10, text11, text13, text14,
## text15, text16, text17, text20, text21, text22, text23, text24, text25, text27,
## text29, text30, text32, text34, text35, text41, text42, text43, text45, text46,
## text47, text48, text50, text51, text52, text53, text54, text55, text56, text58,
## text60, text62, text63, text64, text65, text70, text71, text72, text75, text76,
## text77, text78, text79, text80, text81, text82, text83, text84, text85, text86,
## text87, text88, text89, text90, text91, text92, text93, text95, text98, text101,
## text110, text113, text115, text116, text118, text120, text121, text122, text123,
## text124, text126, text127, text128, text129, text133, text136, text137, text138,
## text139, text140, text142, text145, text147, text149, text150, text151, text153,
## text155, text157, text158, text160, text162, text165, text166, text177, text178,
## text179, text180, text181, text182, text183, text184, text185, text186, text189,
## text190, text191, text192, text193, text195, text198, text200, text202, text203,
## text204, text205, text207, text208, text210, text211, text212, text215, text217,
## text219, text220, text221, text224, text225, text226, text227, text228, text229,
## text230, text231, text233, text235, text237, text238, text239, text240, text241,
## text242, text244, text245, text246, text247, text249, text250, text253, text256,
## text258, text259, text260, text261, text263, text265, text266, text267, text268,
## text269, text270, text273, text274, text275, text276, text277, text280, text281,
## text282, text284, text285, text286, text290, text292, text293, text295, text297,
## text298, text299, text300, text301, text302, text303, text304, text306, text307,
## text309, text310, text311, text315, text319, text320, text321, text322, text323,
## text324, text326, text327, text328, text329, text330, text333, text334, text335,
## text336, text337, text339, text340, text342, text343, text344, text345, text347,
## text348, text351, text352, text353, text354, text358, text359, text360, text364,
## text367, text371, text377, text378, text380, text381, text382, text383, text385,
## text386, text389, text391, text392, text393, text394, text395, text398, text399,
## text400, text403, text404, text405, text409, text412, text415, text416, text417,
## text419, text420, text421, text423, text424, text426, text428, text429, text431,
## text433, text435, text437, text438, text440, text441, text442, text443, text444,
## text445, text446, text447, text449, text450, text454, text456, text459, text461,
## text463, text464, text465, text466, text467, text470, text471, text473, text474,
## text475, text476, text477, text478, text479, text483, text485, text486, text487,
## text488, text489, text490, text491, text492, text493, text494, text495, text499,
## text501, text502, text505, text507, text508, text510, text514, text516, text517,
## text518, text519, text520, text521, text522, text523, text524, text526, text528,
## text529, text530, text531, text532, text533, text534, text536, text537, text538,
## text539, text541, text542, text543, text545, text546, text547, text548, text549,
## text550, text551, text552, text553, text554, text555, text556, text557, text559,
## text561, text563, text571, text572, text573, text575, text576, text579, text580,
## text581, text582, text584, text585, text586, text587, text589, text590, text591,
## text594, text595, text597, text598, text599, text600, text601, text602, text605,
## text606, text607, text608, text609, text610, text612, text614, text616, text617,
## text618, text619, text620, text621, text622, text623, text625, text626, text628,
## text629, text630, text633, text634, text635, text636, text637, text638, text639,
## text640, text641, text642, text643, text644, text645, text646, text647, text648,
## text649, text651, text652, text653, text654, text655, text656, text657, text658,
## text659, text661, text662, text663, text664, text665, text668, text669, text670,
## text671, text672, text675, text677, text678, text679, text680, text681, text682,
## text683, text685, text686, text687, text690, text691, text694, text695, text696,
## text697, text701, text705, text706, text707, text711, text716, text717, text718,
## text720, text721, text722, text723, text724, text726, text727, text728, text729,
## text730, text731, text732, text735, text736, text738, text739, text742, text743,
## text744, text745, text748, text750, text752, text754, text755, text760, text762,
## text763, text764, text765, text766, text767, text768, text769, text770, text771,
## text772, text773, text774, text776, text777, text778, text780, text781, text782,
## text783, text784, text785, text787, text788, text790, text791, text795, text796,
## text797, text798, text799, text800, text803, text804, text805, text809, text813,
## text814, text815, text816, text817, text818, text819, text820, text824, text826,
## text829, text830, text831, text832, text835, text837, text838, text839, text840,
## text841, text843, text848, text850, text852, text853, text855, text856, text858,
## text860, text861, text863, text864, text866, text868, text869, text872, text873,
## text874, text882, text883, text886, text887, text888, text890, text894, text896,
## text897, text902, text903, text904, text908, text910, text911, text912, text913,
## text914, text915, text916, text917, text918, text919, text920, text921, text923,
## text924, text925, text928, text929, text930, text932, text933, text937, text938,
## text939, text940, text941, text943, text946, text947, text948, text950, text951,
## text956, text957, text959, text960, text962, text963, text964, text965, text966,
## text967, text968, text969, text970, text972, text973, text974, text975, text976,
## text977, text978, text979, text980, text981, text983, text984, text985, text986,
## text989, text990, text991, text992, text994, text995, text996, text1001,
## text1002, text1003, text1004, text1005, text1006, text1007, text1008, text1009,
## text1010, text1011, text1012, text1013, text1014, text1017, text1018, text1019,
## text1020, text1021, text1026, text1027, text1028, text1029, text1030, text1031,
## text1035, text1036, text1038, text1040, text1042, text1044, text1045, text1046,
## text1047, text1048, text1049, text1050, text1052, text1053, text1054, text1055,
## text1059, text1060, text1061, text1062, text1063, text1066, text1067, text1069,
## text1072, text1073, text1074, text1076, text1080, text1083, text1084, text1085,
## text1086, text1087, text1089, text1090, text1091, text1092, text1093, text1094,
## text1095, text1096, text1097, text1101, text1102, text1104, text1105, text1107,
## text1109, text1111, text1112, text1113, text1115, text1116, text1117, text1118,
## text1119, text1120, text1121, text1122, text1123, text1124, text1125, text1126,
## text1127, text1129, text1132, text1133, text1134, text1136, text1138, text1139,
## text1141, text1144, text1146, text1147, text1148, text1149, text1151, text1152,
## text1153, text1154, text1155, text1156, text1158, text1159, text1161, text1162,
## text1165, text1166, text1168, text1171, text1172, text1175, text1176, text1177,
## text1178, text1179, text1180, text1181, text1182, text1183, text1187, text1188,
## text1191, text1192, text1194, text1195, text1203, text1205, text1206, text1207,
## text1211, text1215, text1218, text1219, text1220, text1223, text1224, text1225,
## text1231, text1236, text1238, text1240, text1241, text1244, text1246, text1247,
## text1248, text1249, text1250, text1251, text1252, text1253, text1257, text1259,
## text1262, text1263, text1264, text1265, text1266, text1267, text1268, text1269,
## text1270, text1273, text1274, text1276, text1278, text1279, text1280, text1281,
## text1283, text1284, text1285, text1288, text1289, text1290, text1291, text1292,
## text1295, text1296, text1299, text1301, text1302, text1307, text1308, text1311,
## text1313, text1316, text1318, text1319, text1320, text1321, text1324, text1326,
## text1327, text1328, text1329, text1331, text1332, text1335, text1336, text1338,
## text1345, text1347, text1348, text1349, text1350, text1351, text1353, text1354,
## text1355, text1356, text1357, text1360, text1363, text1365, text1368, text1369,
## text1370, text1372, text1373, text1374, text1376, text1377, text1378, text1379,
## text1380, text1382, text1383, text1384, text1385, text1387, text1388, text1390,
## text1391, text1392, text1393, te

# Unseeded LDA (seededlda package), 10 topics, on the trimmed dfm.
# NOTE(review): max_docfreq = 10 in dfm_trim restricts features to
# very rare terms, which likely explains the obscure words in the
# topic listing below — verify the trim thresholds are intended.
tmod_lda <- textmodel_lda(quant_dfm, k = 10)
terms(tmod_lda, 10)
##       topic1         topic2        topic3      topic4       
##  [1,] "supremacist"  "kbp"         "mocph"     "flagjan"    
##  [2,] "ps4s"         "transistor"  "whr"       "quicktime"  
##  [3,] "vermont"      "acute"       "v2v"       "levi1994"   
##  [4,] "salvage"      "mhz"         "burden"    "lancaster"  
##  [5,] "d"            "transformer" "pv"        "fargo"      
##  [6,] "mercedesbenz" "m3d"         "x5"        "2a"         
##  [7,] "misuse"       "7kw"         "loon"      "rival"      
##  [8,] "japan"        "tweeter"     "handshake" "handler"    
##  [9,] "seasucker"    "twolane"     "oneplus"   "60amp"      
## [10,] "100kwh"       "vids"        "seize"     "contaminate"
##       topic5              topic6        topic7      topic8        topic9       
##  [1,] "provision"         "condenser"   "ohmmu"     "encryption"  "l1"         
##  [2,] "1hr"               "uvc"         "un"        "demographic" "cu"         
##  [3,] "bid"               "nn"          "skid"      "iihs"        "immersive"  
##  [4,] "tb"                "acc"         "hydrogen"  "coefficient" "probability"
##  [5,] "edison"            "ozone"       "goodyear"  "windscreen"  "amg"        
##  [6,] "analog"            "gdo"         "hankook"   "cello"       "voicemail"  
##  [7,] "fiscal"            "cotton"      "buffet"    "slant"       "nashville"  
##  [8,] "turo"              "institution" "polarize"  "tsb"         "15th"       
##  [9,] "cussenjim98521400" "refill"      "200pm"     "applicant"   "anker"      
## [10,] "tom"               "tar"         "assurance" "vlc"         "generosity" 
##       topic10       
##  [1,] "24ghz"       
##  [2,] "ferry"       
##  [3,] "eero"        
##  [4,] "powell"      
##  [5,] "impedance"   
##  [6,] "assert"      
##  [7,] "kg"          
##  [8,] "jurisdiction"
##  [9,] "chargeport"  
## [10,] "selector"
# Load the analyst-defined seed-word dictionary (one key per expected topic)
dict_topic <- dictionary(file = "tesla_topics.yml")
## Warning in readLines(con): incomplete final line found on 'tesla_topics.yml'
# NOTE(review): the warning above only means tesla_topics.yml lacks a trailing
# newline; harmless here, but worth fixing in the YAML file itself.
# Fit a seeded (semi-supervised) LDA: each dictionary key seeds one topic
tmod_slda <- textmodel_seededlda(quant_dfm, dictionary = dict_topic)
# Show the top 20 terms assigned to each seeded topic
terms(tmod_slda, 20)
##       battery           app            fsd            purchase     
##  [1,] "mileshr"         "condenser"    "nn"           "cu"         
##  [2,] "chargeport"      "quicktime"    "gdo"          "ps4s"       
##  [3,] "mileswhite"      "uvc"          "1hr"          "supremacist"
##  [4,] "milesday"        "provision"    "buffet"       "downpayment"
##  [5,] "mileskwh"        "ohmmu"        "reengage"     "hankook"    
##  [6,] "chargedischarge" "ozone"        "screech"      "lancaster"  
##  [7,] "chargepoints"    "composite"    "tub"          "institution"
##  [8,] "milestone"       "brooklyn"     "m3d"          "hydrogen"   
##  [9,] "chargingi"       "cotton"       "longdistance" "twolane"    
## [10,] "mileshour"       "mercedesbenz" "seize"        "fargo"      
## [11,] "chargei"         "tar"          "monterey"     "goodyear"   
## [12,] "chargingbjorn"   "supplemental" "200pm"        "tb"         
## [13,] "chargerstesla"   "applicant"    "kb"           "kg"         
## [14,] "milesmy"         "vlc"          "inspire"      "blvd"       
## [15,] "milescharge"     "d"            "breakfast"    "assurance"  
## [16,] "chargingit"      "loon"         "autobahn"     "patsy"      
## [17,] "milesit"         "skid"         "autoclose"    "tom"        
## [18,] "chargeri"        "sanitize"     "benchmark"    "gtfo"       
## [19,] "milesand"        "lubricant"    "scent"        "fascinate"  
## [20,] "milestesla"      "handshake"    "cease"        "31k"        
##       sentry        home            interior   
##  [1,] "kbp"         "mocph"         "l1"       
##  [2,] "flagjan"     "whr"           "immersive"
##  [3,] "levi1994"    "probability"   "v2v"      
##  [4,] "emulate"     "demographic"   "salvage"  
##  [5,] "mhz"         "x5"            "un"       
##  [6,] "uncle"       "transistor"    "iihs"     
##  [7,] "assert"      "vermont"       "refill"   
##  [8,] "coefficient" "2a"            "leakage"  
##  [9,] "60amp"       "bid"           "amg"      
## [10,] "windscreen"  "oneplus"       "bead"     
## [11,] "greece"      "handler"       "latency"  
## [12,] "7kw"         "transformer"   "theme"    
## [13,] "decimal"     "polarize"      "retard"   
## [14,] "foreground"  "disengagement" "convex"   
## [15,] "waffle"      "sacramento"    "tuner"    
## [16,] "ccs2"        "incompatible"  "nv"       
## [17,] "generosity"  "kinetic"       "postpone" 
## [18,] "p7"          "canbus"        "voicemail"
## [19,] "mihr"        "slant"         "cello"    
## [20,] "readout"     "thermometer"   "nashville"

Targeted Dictionary Analysis

# Reduce the columns to just what's needed
quant_tesla <- select(df_select, doc_id, Discussion, User, Time)

# Quanteda requires the text field to be called "text"
quant_tesla <- quant_tesla %>%
  rename(text = Discussion)

# Create the corpus; the remaining columns become document variables (docvars)
corp_tesla <- corpus(quant_tesla)

# Add docvars for Year, Month, and Week Number (used for grouping below)
corp_tesla$year <- year(corp_tesla$Time)
corp_tesla$month <- month(corp_tesla$Time)
corp_tesla$week <- week(corp_tesla$Time)

# Subset the corpus to just 2020.
# BUG FIX: the original `"year" >= 2020` compared the string literal "year"
# against 2020 (a character comparison that is TRUE for every document), so
# nothing was actually filtered. corpus_subset() evaluates its expression
# against the docvars, so the bare docvar name must be used.
corp_tesla <- corpus_subset(corp_tesla, year >= 2020)

# Tokenize, dropping punctuation
toks_tesla <- quanteda::tokens(corp_tesla, remove_punct = TRUE)

Full Self Driving

# get relevant keywords and phrases
fsd <- c("fsd", "self driving", "autopilot")

# only keep tokens specified above and their context of ±10 tokens
toks_fsd <- tokens_keep(toks_tesla, pattern = phrase(fsd), window = 10)

# Map the surviving context tokens onto the Lexicoder negative/positive
# sentiment categories (LSD2015 entries 1 and 2)
toks_fsd <- tokens_lookup(toks_fsd, dictionary = data_dictionary_LSD2015[1:2])

# Create a document-feature matrix and group it by week in 2020.
# BUG FIX: dfm_group()'s argument is `groups` and is evaluated against the
# docvars, so the bare name `week` must be used. The original
# `group = "week"` passed a length-1 character vector, collapsing every
# document into a single group and breaking the weekly time series.
dfmat_fsd_lsd <- dfm(toks_fsd) %>%
    dfm_group(groups = week, fill = TRUE)

matplot(dfmat_fsd_lsd, type = "l", xaxt = "n", lty = 1, ylab = "Frequency", 
        main = "Sentiment of Self-Driving/Full Self Driving for 2020")
grid()
axis(1, seq_len(ndoc(dfmat_fsd_lsd)), ymd("2020-01-01") + weeks(seq_len(ndoc(dfmat_fsd_lsd)) - 1))
legend("topleft", col = 1:2, legend = c("Negative", "Positive"), lty = 1, bg = "white")

# Weekly token counts used to normalize the net-sentiment series.
# BUG FIX: the dfm(x, group = ...) form is no longer supported in
# quanteda >= 3; group with dfm_group() instead, and use fill = TRUE so the
# vector stays aligned week-for-week with dfmat_fsd_lsd above.
n_fsd <- ntoken(dfm_group(dfm(toks_fsd), groups = week, fill = TRUE))

# Net sentiment per week: (positive - negative) / total context tokens
plot((dfmat_fsd_lsd[, 2] - dfmat_fsd_lsd[, 1]) / n_fsd, 
     type = "l", ylab = "Sentiment", xlab = "", xaxt = "n",
     main = "Sentiment of Self-Driving/Full Self Driving for 2020")
axis(1, seq_len(ndoc(dfmat_fsd_lsd)), ymd("2020-01-01") + weeks(seq_len(ndoc(dfmat_fsd_lsd)) - 1))
grid()
abline(h = 0, lty = 2)

Battery Related

# get relevant keywords and phrases
bat <- c("battery", "charge", "range")

# only keep tokens specified above and their context of ±10 tokens
toks_bat <- tokens_keep(toks_tesla, pattern = phrase(bat), window = 10)

# Map the surviving context tokens onto the Lexicoder negative/positive
# sentiment categories (LSD2015 entries 1 and 2)
toks_bat <- tokens_lookup(toks_bat, dictionary = data_dictionary_LSD2015[1:2])

# Create a document-feature matrix and group it by week in 2020.
# BUG FIX: dfm_group()'s argument is `groups` and is evaluated against the
# docvars, so the bare name `week` must be used. The original
# `group = "week"` passed a length-1 character vector, collapsing every
# document into a single group and breaking the weekly time series.
dfmat_bat_lsd <- dfm(toks_bat) %>%
    dfm_group(groups = week, fill = TRUE)

matplot(dfmat_bat_lsd, type = "l", xaxt = "n", lty = 1, ylab = "Frequency",
        main = "Sentiment of Battery/Charging/Range for 2020")
grid()
axis(1, seq_len(ndoc(dfmat_bat_lsd)), ymd("2020-01-01") + weeks(seq_len(ndoc(dfmat_bat_lsd)) - 1))
legend("topleft", col = 1:2, legend = c("Negative", "Positive"), lty = 1, bg = "white")

# Weekly token counts used to normalize the net-sentiment series.
# BUG FIX: the dfm(x, group = ...) form is no longer supported in
# quanteda >= 3; group with dfm_group() instead, and use fill = TRUE so the
# vector stays aligned week-for-week with dfmat_bat_lsd above.
n_bat <- ntoken(dfm_group(dfm(toks_bat), groups = week, fill = TRUE))

# Net sentiment per week: (positive - negative) / total context tokens
plot((dfmat_bat_lsd[, 2] - dfmat_bat_lsd[, 1]) / n_bat, 
     type = "l", ylab = "Sentiment", xlab = "", xaxt = "n",
     main = "Sentiment of Battery/Charging/Range for 2020")
axis(1, seq_len(ndoc(dfmat_bat_lsd)), ymd("2020-01-01") + weeks(seq_len(ndoc(dfmat_bat_lsd)) - 1))
grid()
abline(h = 0, lty = 2)

Software Updates

# get relevant keywords and phrases
sw <- c("software", "update")

# only keep tokens specified above and their context of ±10 tokens
toks_sw <- tokens_keep(toks_tesla, pattern = phrase(sw), window = 10)

# Map the surviving context tokens onto the Lexicoder negative/positive
# sentiment categories (LSD2015 entries 1 and 2)
toks_sw <- tokens_lookup(toks_sw, dictionary = data_dictionary_LSD2015[1:2])

# Create a document-feature matrix and group it by week in 2020.
# BUG FIX: dfm_group()'s argument is `groups` and is evaluated against the
# docvars, so the bare name `week` must be used. The original
# `group = "week"` passed a length-1 character vector, collapsing every
# document into a single group and breaking the weekly time series.
dfmat_sw_lsd <- dfm(toks_sw) %>%
    dfm_group(groups = week, fill = TRUE)

matplot(dfmat_sw_lsd, type = "l", xaxt = "n", lty = 1, ylab = "Frequency",
        main = "Sentiment of Software Updates for 2020")
grid()
axis(1, seq_len(ndoc(dfmat_sw_lsd)), ymd("2020-01-01") + weeks(seq_len(ndoc(dfmat_sw_lsd)) - 1))
legend("topleft", col = 1:2, legend = c("Negative", "Positive"), lty = 1, bg = "white")

# Weekly token counts used to normalize the net-sentiment series.
# BUG FIX: the dfm(x, group = ...) form is no longer supported in
# quanteda >= 3; group with dfm_group() instead, and use fill = TRUE so the
# vector stays aligned week-for-week with dfmat_sw_lsd above.
n_sw <- ntoken(dfm_group(dfm(toks_sw), groups = week, fill = TRUE))

# Net sentiment per week: (positive - negative) / total context tokens
plot((dfmat_sw_lsd[, 2] - dfmat_sw_lsd[, 1]) / n_sw, 
     type = "l", ylab = "Sentiment", xlab = "", xaxt = "n",
     main = "Sentiment of Software Updates for 2020")
axis(1, seq_len(ndoc(dfmat_sw_lsd)), ymd("2020-01-01") + weeks(seq_len(ndoc(dfmat_sw_lsd)) - 1))
grid()
abline(h = 0, lty = 2)

Purchase Process

# get relevant keywords and phrases
own <- c("delivery", "purchase", "owner")

# only keep tokens specified above and their context of ±10 tokens
toks_own <- tokens_keep(toks_tesla, pattern = phrase(own), window = 10)

# Map the surviving context tokens onto the Lexicoder negative/positive
# sentiment categories (LSD2015 entries 1 and 2)
toks_own <- tokens_lookup(toks_own, dictionary = data_dictionary_LSD2015[1:2])

# Create a document-feature matrix and group it by week in 2020.
# BUG FIX: dfm_group()'s argument is `groups` and is evaluated against the
# docvars, so the bare name `week` must be used. The original
# `group = "week"` passed a length-1 character vector, collapsing every
# document into a single group and breaking the weekly time series.
dfmat_own_lsd <- dfm(toks_own) %>%
    dfm_group(groups = week, fill = TRUE)

matplot(dfmat_own_lsd, type = "l", xaxt = "n", lty = 1, ylab = "Frequency",
        main = "Sentiment of Purchasing Process for 2020")
grid()
axis(1, seq_len(ndoc(dfmat_own_lsd)), ymd("2020-01-01") + weeks(seq_len(ndoc(dfmat_own_lsd)) - 1))
legend("topleft", col = 1:2, legend = c("Negative", "Positive"), lty = 1, bg = "white")

# Weekly token counts used to normalize the net-sentiment series.
# BUG FIX: the dfm(x, group = ...) form is no longer supported in
# quanteda >= 3; group with dfm_group() instead, and use fill = TRUE so the
# vector stays aligned week-for-week with dfmat_own_lsd above.
n_own <- ntoken(dfm_group(dfm(toks_own), groups = week, fill = TRUE))

# Net sentiment per week: (positive - negative) / total context tokens
plot((dfmat_own_lsd[, 2] - dfmat_own_lsd[, 1]) / n_own, 
     type = "l", ylab = "Sentiment", xlab = "", xaxt = "n",
     main = "Sentiment of Purchasing Process for 2020")
axis(1, seq_len(ndoc(dfmat_own_lsd)), ymd("2020-01-01") + weeks(seq_len(ndoc(dfmat_own_lsd)) - 1))
grid()
abline(h = 0, lty = 2)